Chapter 3 Data Summary

rm(list=ls()) #clear environment
load("data/squirrels_data.Rdata")

3.1 Sample summary

Summary of sampled individuals and analysed faecal samples.

#number of samples
length(sample_metadata$sample)
[1] 190
#number of samples by species
sample_metadata %>%
  group_by(species) %>%
  summarise(n_samples = length(sample)) %>%
  tt()
tinytable_lk7cytxxn65kdlks49j6
species n_samples
Sciurus carolinensis 80
Sciurus vulgaris 110
#number of samples by species and sex
sample_metadata %>%
  group_by(species, sex) %>%
  summarise(n_samples = length(sample)) %>%
  tt()
tinytable_fw11outygxst69m6yuj2
species sex n_samples
Sciurus carolinensis F 44
Sciurus carolinensis M 36
Sciurus vulgaris F 63
Sciurus vulgaris M 47
#number of samples by species and development
sample_metadata %>%
  group_by(species, development) %>%
  summarise(n_samples = length(sample)) %>%
  tt()
tinytable_xyh9vpryqipot1b9w8x2
species development n_samples
Sciurus carolinensis Adult 71
Sciurus carolinensis Juvenile 2
Sciurus carolinensis Nursing 3
Sciurus carolinensis Pregnant 4
Sciurus vulgaris Adult 90
Sciurus vulgaris Juvenile 1
Sciurus vulgaris Nursing 8
Sciurus vulgaris Pregnant 11
#number of samples by species and type of area
sample_metadata %>%
  group_by(species,area_type) %>%
  summarise(n_samples = length(sample)) %>%
  tt()
tinytable_gys4xiuh6mq9h9uglruh
species area_type n_samples
Sciurus carolinensis rural 29
Sciurus carolinensis suburban 24
Sciurus carolinensis urban 27
Sciurus vulgaris rural 37
Sciurus vulgaris suburban 30
Sciurus vulgaris urban 43
#number of distinct squirrels
n_distinct(sample_metadata$animal)
[1] 108
#number of squirrels by species and type of area
sample_metadata %>%
  group_by(species,area_type) %>%
  summarise(distinct_squirrels = n_distinct(animal)) %>%
  tt()
tinytable_f552a9lmw8ar27gy640x
species area_type distinct_squirrels
Sciurus carolinensis rural 14
Sciurus carolinensis suburban 13
Sciurus carolinensis urban 18
Sciurus vulgaris rural 21
Sciurus vulgaris suburban 14
Sciurus vulgaris urban 28
#number of squirrels by species and season
sample_metadata %>%
  group_by(species,season) %>%
  summarise(distinct_squirrels = n_distinct(animal)) %>%
  tt()
tinytable_v3b2ranfx33swu56ma0m
species season distinct_squirrels
Sciurus carolinensis autumn 33
Sciurus carolinensis spring-summer 22
Sciurus carolinensis winter 25
Sciurus vulgaris autumn 39
Sciurus vulgaris spring-summer 38
Sciurus vulgaris winter 33
#n of analysed faecal samples
ncol(read_counts)
[1] 191

Geographical location of sampled red squirrel (light blue) and grey squirrel (pink) populations in Italy.

#Summarise for generating map
options(dplyr.summarise.inform = FALSE)
sample_metadata_summary <- sample_metadata %>%
  #Group by geography and count samples
  select(sample, latitude, longitude, country, species) %>%
  group_by(latitude, longitude, species) %>%
  summarize(count = n()) %>%
  ungroup()

italy <- map_data("world", region="italy") %>%
  summarise(long = mean(long), lat = mean(lat))

#plotting on map
sample_metadata_summary %>%
  ggplot(.) +
  #render map
  geom_map(
    data=map_data("world", region="italy"),
    map = map_data("world", region="italy"),
    aes(long, lat, map_id=region),
    color = "white", fill = "#cccccc", linewidth = 0.2
  ) +
  #render points
  geom_point(
    aes(x=longitude,y=latitude, color=species),
    alpha=0.5, shape=16) +
  #add general plot layout
  theme_minimal() +
  theme(legend.position = "none",
        axis.title.x=element_blank(),
        axis.title.y=element_blank()
  ) + coord_map("mercator")

3.2 Sequencing data summary

Total amount of sequencing data generated from the analysed samples.

#amount of discarded data (GB)
sum(round(((sample_metadata$metagenomic_bases+sample_metadata$host_bases)/
             (1-sample_metadata$bases_lost_fastp_percent))-
            (sample_metadata$metagenomic_bases+sample_metadata$host_bases)))/1000000000
[1] 63.90045
#amount of host data (GB)
sum(sample_metadata$host_bases)/1000000000
[1] 166.0275
#amount of metagenomic data (GB)
sum(sample_metadata$metagenomic_bases)/1000000000
[1] 786.1584
#amount of estimated prokaryotic data (singleM)
sum(sample_metadata$metagenomic_bases * sample_metadata$singlem_fraction)/1000000000
[1] 557.7475

Origin of DNA sequences obtained from each sample.

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
  left_join(sample_metadata, by = join_by(sample == sample))  %>%
  select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
  mutate(mags_bases = mags*146) %>%
  mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
  mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
  mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
  select(sample,mags_bases,unmapped_bases,host_bases,lowqual_bases)

mags_bases_mean <- sequence_fractions %>%
  mutate(mags_bases = mags_bases / 1000000000) %>%
  select(mags_bases) %>%
  pull() %>%
  mean()

sequence_fractions %>%
  pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
  mutate(value = value / 1000000000) %>%
  mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
  ggplot(., aes(x = sample, y = value, fill=fraction)) +
  geom_bar(position="stack", stat = "identity") +
  scale_fill_manual(values=c("#CCCCCC","#178a94","#ee8080","#d03161")) +
  geom_hline(yintercept = mags_bases_mean, linetype = "dashed", color = "black") +
  labs(x = "Samples", y = "Amount of data (GB)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

Sequencing assessment: difference between mapping rate and estimated singleM proportion

# Estimated vs mapped prokaryotic fraction
sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
  left_join(sample_metadata, by = join_by(sample == sample))  %>%
  select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
  mutate(mags_bases = mags*146) %>%
  mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
  mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
  mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
  select(sample,mags_bases,unmapped_bases,host_bases,lowqual_bases)

singlem_table <- sequence_fractions %>%
  mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
  left_join(sample_metadata, by = join_by(sample == sample))  %>%
  mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
  select(sample,mags_proportion,singlem_proportion) %>%
  mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
  mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
  mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
  mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify


singlem_table %>%
  pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
  mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
  ggplot(., aes(x = value, y = sample, color=proportion)) +
  geom_line(aes(group = sample), color = "#f8a538") +
  geom_point() +
  scale_color_manual(values=c("#52e1e8","#876b53")) +
  theme_classic() +
  labs(y = "Samples", x = "Prokaryotic fraction (%)") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "right")

# Export difference b/w mags and singlem proportions to be used later in script 05-diversity_models
singlem <- sequence_fractions %>%
  mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
  left_join(sample_metadata, by = join_by(sample == sample))  %>%
  mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
  mutate(mags_singlem = mags_proportion/singlem_proportion) %>%
  mutate(est_mapp = ifelse(mags_singlem >= 1, 1, mags_singlem)) %>%
  select(sample,mags_singlem,est_mapp)

write.table(singlem, file = "data/singlem.csv", row.names = FALSE, dec = ".", sep = ";", 
              quote = FALSE)

3.3 MAGs summary

#number of MAGs
nrow(read_counts)
[1] 1687
#number of MAGs without species-level annotation (i.e., "new species")
genome_metadata %>%
  filter(species == "s__") %>%
  nrow()
[1] 1455
#number of phylums
genome_metadata %>%
  select(phylum) %>%
  unique() %>%
  pull() %>%
  length()
[1] 13